In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import plotly.express as px
In [3]:
#import datasets
File = pd.read_csv(r"C:\Users\AMIT KUMAR\Downloads\Unemployment in India (1).csv",header=0)
In [4]:
#View first 10 rows
File.head(10)
Out[4]:
| Region | Date | Frequency | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 31-05-2019 | Monthly | 3.65 | 11999139.0 | 43.24 | Rural |
| 1 | Andhra Pradesh | 30-06-2019 | Monthly | 3.05 | 11755881.0 | 42.05 | Rural |
| 2 | Andhra Pradesh | 31-07-2019 | Monthly | 3.75 | 12086707.0 | 43.50 | Rural |
| 3 | Andhra Pradesh | 31-08-2019 | Monthly | 3.32 | 12285693.0 | 43.97 | Rural |
| 4 | Andhra Pradesh | 30-09-2019 | Monthly | 5.17 | 12256762.0 | 44.68 | Rural |
| 5 | Andhra Pradesh | 31-10-2019 | Monthly | 3.52 | 12017412.0 | 43.01 | Rural |
| 6 | Andhra Pradesh | 30-11-2019 | Monthly | 4.12 | 11397681.0 | 41.00 | Rural |
| 7 | Andhra Pradesh | 31-12-2019 | Monthly | 4.38 | 12528395.0 | 45.14 | Rural |
| 8 | Andhra Pradesh | 31-01-2020 | Monthly | 4.84 | 12016676.0 | 43.46 | Rural |
| 9 | Andhra Pradesh | 29-02-2020 | Monthly | 5.91 | 11723617.0 | 42.83 | Rural |
In [5]:
#Display the dataframes
File.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Region 740 non-null object 1 Date 740 non-null object 2 Frequency 740 non-null object 3 Estimated Unemployment Rate (%) 740 non-null float64 4 Estimated Employed 740 non-null float64 5 Estimated Labour Participation Rate (%) 740 non-null float64 6 Area 740 non-null object dtypes: float64(3), object(4) memory usage: 42.1+ KB
In [6]:
#Check for missing or null values
File.isna().sum()
Out[6]:
Region 28 Date 28 Frequency 28 Estimated Unemployment Rate (%) 28 Estimated Employed 28 Estimated Labour Participation Rate (%) 28 Area 28 dtype: int64
In [7]:
#dropping missing values
File = File.dropna()
File.isna().sum()
Out[7]:
Region 0 Date 0 Frequency 0 Estimated Unemployment Rate (%) 0 Estimated Employed 0 Estimated Labour Participation Rate (%) 0 Area 0 dtype: int64
In [8]:
#checking columns
File.columns
Out[8]:
Index(['Region', ' Date', ' Frequency', ' Estimated Unemployment Rate (%)',
' Estimated Employed', ' Estimated Labour Participation Rate (%)',
'Area'],
dtype='object')
In [9]:
#stripping the spaces infront of the columns
File.columns = File.columns.str.strip()
In [10]:
File.columns
Out[10]:
Index(['Region', 'Date', 'Frequency', 'Estimated Unemployment Rate (%)',
'Estimated Employed', 'Estimated Labour Participation Rate (%)',
'Area'],
dtype='object')
In [11]:
#checking for duplicates
File.duplicated().sum()
Out[11]:
0
In [12]:
#checking unique values
File["Frequency"].unique()
Out[12]:
array([' Monthly', 'Monthly'], dtype=object)
In [13]:
# Strip the empty space to get 1 unique value
File["Frequency"] = File["Frequency"].str.strip()
File["Frequency"].unique()
Out[13]:
array(['Monthly'], dtype=object)
In [14]:
# Check unique values for the date column
File["Date"].unique()
Out[14]:
array([' 31-05-2019', ' 30-06-2019', ' 31-07-2019', ' 31-08-2019',
' 30-09-2019', ' 31-10-2019', ' 30-11-2019', ' 31-12-2019',
' 31-01-2020', ' 29-02-2020', ' 31-03-2020', ' 30-04-2020',
' 31-05-2020', ' 30-06-2020'], dtype=object)
In [15]:
# Strip the empty space to get 1 unique value
File["Date"] = File["Date"].str.strip()
File["Date"].unique()
Out[15]:
array(['31-05-2019', '30-06-2019', '31-07-2019', '31-08-2019',
'30-09-2019', '31-10-2019', '30-11-2019', '31-12-2019',
'31-01-2020', '29-02-2020', '31-03-2020', '30-04-2020',
'31-05-2020', '30-06-2020'], dtype=object)
In [17]:
# Dropping the "Frequency" column
files = File.drop("Frequency", axis=1)
In [18]:
# Changing the date datatype from object to datetime
files['Date'] = pd.to_datetime(files["Date"], format="%d-%m-%Y")
In [19]:
# Create year-month column for aggregation
files['Year_Month'] = files['Date'].dt.strftime('%Y-%m')
In [20]:
# Create Estimated Employed Rate column
files['Estimated Employed Rate (%)'] = (files['Estimated Labour Participation Rate (%)'] - files['Estimated Unemployment Rate (%)'])
files
Out[20]:
| Region | Date | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | Year_Month | Estimated Employed Rate (%) | |
|---|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 2019-05-31 | 3.65 | 11999139.0 | 43.24 | Rural | 2019-05 | 39.59 |
| 1 | Andhra Pradesh | 2019-06-30 | 3.05 | 11755881.0 | 42.05 | Rural | 2019-06 | 39.00 |
| 2 | Andhra Pradesh | 2019-07-31 | 3.75 | 12086707.0 | 43.50 | Rural | 2019-07 | 39.75 |
| 3 | Andhra Pradesh | 2019-08-31 | 3.32 | 12285693.0 | 43.97 | Rural | 2019-08 | 40.65 |
| 4 | Andhra Pradesh | 2019-09-30 | 5.17 | 12256762.0 | 44.68 | Rural | 2019-09 | 39.51 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 749 | West Bengal | 2020-02-29 | 7.55 | 10871168.0 | 44.09 | Urban | 2020-02 | 36.54 |
| 750 | West Bengal | 2020-03-31 | 6.67 | 10806105.0 | 43.34 | Urban | 2020-03 | 36.67 |
| 751 | West Bengal | 2020-04-30 | 15.63 | 9299466.0 | 41.20 | Urban | 2020-04 | 25.57 |
| 752 | West Bengal | 2020-05-31 | 15.22 | 9240903.0 | 40.67 | Urban | 2020-05 | 25.45 |
| 753 | West Bengal | 2020-06-30 | 9.86 | 9088931.0 | 37.57 | Urban | 2020-06 | 27.71 |
740 rows × 8 columns
In [21]:
# since we have a Year-Month column, we dropped the date
files = files.drop("Date", axis=1)
files
Out[21]:
| Region | Estimated Unemployment Rate (%) | Estimated Employed | Estimated Labour Participation Rate (%) | Area | Year_Month | Estimated Employed Rate (%) | |
|---|---|---|---|---|---|---|---|
| 0 | Andhra Pradesh | 3.65 | 11999139.0 | 43.24 | Rural | 2019-05 | 39.59 |
| 1 | Andhra Pradesh | 3.05 | 11755881.0 | 42.05 | Rural | 2019-06 | 39.00 |
| 2 | Andhra Pradesh | 3.75 | 12086707.0 | 43.50 | Rural | 2019-07 | 39.75 |
| 3 | Andhra Pradesh | 3.32 | 12285693.0 | 43.97 | Rural | 2019-08 | 40.65 |
| 4 | Andhra Pradesh | 5.17 | 12256762.0 | 44.68 | Rural | 2019-09 | 39.51 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 749 | West Bengal | 7.55 | 10871168.0 | 44.09 | Urban | 2020-02 | 36.54 |
| 750 | West Bengal | 6.67 | 10806105.0 | 43.34 | Urban | 2020-03 | 36.67 |
| 751 | West Bengal | 15.63 | 9299466.0 | 41.20 | Urban | 2020-04 | 25.57 |
| 752 | West Bengal | 15.22 | 9240903.0 | 40.67 | Urban | 2020-05 | 25.45 |
| 753 | West Bengal | 9.86 | 9088931.0 | 37.57 | Urban | 2020-06 | 27.71 |
740 rows × 7 columns
In [23]:
import matplotlib.pyplot as plt
In [24]:
# Calculate average unemployment rate by year-month
unemployment_trend = files.groupby('Year_Month')['Estimated Unemployment Rate (%)'].mean().reset_index()
# Create the visualization
plt.figure(figsize=(12, 6))
plt.plot(unemployment_trend['Year_Month'], unemployment_trend['Estimated Unemployment Rate (%)'], marker='o')
plt.xticks(rotation=45)
plt.title('Average Unemployment Rate Trend in India')
plt.xlabel('Year_Month')
plt.ylabel('Unemployment Rate (%)')
plt.grid(True)
plt.tight_layout()
plt.show()
print("\
average unemployment rate:")
print(unemployment_trend)
average unemployment rate: Year_Month Estimated Unemployment Rate (%) 0 2019-05 8.874259 1 2019-06 9.303333 2 2019-07 9.033889 3 2019-08 9.637925 4 2019-09 9.051731 5 2019-10 9.900909 6 2019-11 9.868364 7 2019-12 9.497358 8 2020-01 9.950755 9 2020-02 9.964717 10 2020-03 10.700577 11 2020-04 23.641569 12 2020-05 24.875294 13 2020-06 11.903600
In [25]:
plt.figure(figsize=(13, 9))
# Calculate average unemployment rate by Region
Region_umemployment_rate = files.groupby('Region')['Estimated Unemployment Rate (%)'].mean().reset_index()
# Creating a column chart
sns.barplot(x=Region_umemployment_rate['Region'], y=Region_umemployment_rate['Estimated Unemployment Rate (%)'])
# Adding title and labels
plt.title('Average Unemployment Rate by Region')
plt.xlabel('Region')
plt.ylabel('Estimated Unemployment Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7) # Add horizontal gridlines
# Rotate the x-axis labels for better readability
plt.xticks(rotation=45, ha='right') # Rotate by 45 degrees and align to the right
# Adjusting layout and displaying the plot
plt.tight_layout()
plt.show()
print("\
Average Unemployment Rate by Region:")
print(Region_umemployment_rate)
Average Unemployment Rate by Region:
Region Estimated Unemployment Rate (%)
0 Andhra Pradesh 7.477143
1 Assam 6.428077
2 Bihar 18.918214
3 Chandigarh 15.991667
4 Chhattisgarh 9.240357
5 Delhi 16.495357
6 Goa 9.274167
7 Gujarat 6.663929
8 Haryana 26.283214
9 Himachal Pradesh 18.540357
10 Jammu & Kashmir 16.188571
11 Jharkhand 20.585000
12 Karnataka 6.676071
13 Kerala 10.123929
14 Madhya Pradesh 7.406429
15 Maharashtra 7.557500
16 Meghalaya 4.798889
17 Odisha 5.657857
18 Puducherry 10.215000
19 Punjab 12.031071
20 Rajasthan 14.058214
21 Sikkim 7.249412
22 Tamil Nadu 9.284286
23 Telangana 7.737857
24 Tripura 28.350357
25 Uttar Pradesh 12.551429
26 Uttarakhand 6.582963
27 West Bengal 8.124643
In [26]:
# Calculate Area Estimated Unemployment total
urban = files.groupby('Area')['Estimated Unemployment Rate (%)'].sum().reset_index()
fig = px.pie(urban,names="Area",values='Estimated Unemployment Rate (%)',
color_discrete_sequence=["BLACK","GREEN"],title="Rural-Urban percentage Uemployment rate", hole=0.5)
# Set the size of the chart
fig.update_layout(width=1200, height=600)
fig.show()
print("\
Area Estimated Unemployment total (%):")
print(urban)
Area Estimated Unemployment total (%):
Area Estimated Unemployment Rate (%)
0 Rural 3706.60
1 Urban 5016.48
In [27]:
plt.figure(figsize=(13, 9))
# Calculate average Estimated Labour Participation Rate (%) by region
Region_Labour_Participation_Rate = files.groupby('Region')['Estimated Labour Participation Rate (%)'].mean().reset_index()
# Creating a column chart
sns.barplot(x=Region_Labour_Participation_Rate['Region'], y=Region_Labour_Participation_Rate['Estimated Labour Participation Rate (%)'])
# Adding title and labels
plt.title(' Estimated Labour Participation Rate (%) by Region')
plt.xlabel('Region')
plt.ylabel('Estimated Labour Participation Rate (%)')
plt.grid(axis='y', linestyle='--', alpha=0.7) # Add horizontal gridlines
# Rotate the x-axis labels for better readability
plt.xticks(rotation=45, ha='right') # Rotate by 45 degrees and align to the right
# Adjusting layout and displaying the plot
plt.tight_layout()
plt.show()
# Display cluster sizes and centers
print("\
Data Quality Check:")
print("Region_Labour_Participation_Rate:")
print(Region_Labour_Participation_Rate)
Data Quality Check:
Region_Labour_Participation_Rate:
Region Estimated Labour Participation Rate (%)
0 Andhra Pradesh 39.375714
1 Assam 44.868462
2 Bihar 38.153929
3 Chandigarh 39.336667
4 Chhattisgarh 42.810714
5 Delhi 38.929643
6 Goa 39.249583
7 Gujarat 46.101071
8 Haryana 42.737143
9 Himachal Pradesh 44.222143
10 Jammu & Kashmir 41.030952
11 Jharkhand 41.670714
12 Karnataka 41.345357
13 Kerala 34.867857
14 Madhya Pradesh 38.821429
15 Maharashtra 42.303214
16 Meghalaya 57.080741
17 Odisha 38.926429
18 Puducherry 38.992692
19 Punjab 41.138214
20 Rajasthan 39.973214
21 Sikkim 46.070000
22 Tamil Nadu 40.872143
23 Telangana 53.002500
24 Tripura 61.823929
25 Uttar Pradesh 39.432500
26 Uttarakhand 33.775556
27 West Bengal 45.417500
In [28]:
# Calculate Estimated_Employed By Region
plt.figure(figsize=(12, 6))
Estimated_Employed_trend = files.groupby('Region')['Estimated Employed'].mean().reset_index()
sns.barplot(x = Estimated_Employed_trend['Estimated Employed'], y = Estimated_Employed_trend['Region'])
plt.title('Estimated_Employed By Region')
plt.xlabel('Estimated Employed')
plt.tight_layout()
plt.show()
print("\
Estimated_Employed By Region:")
print(Estimated_Employed_trend)
Estimated_Employed By Region:
Region Estimated Employed
0 Andhra Pradesh 8.154093e+06
1 Assam 5.354772e+06
2 Bihar 1.236619e+07
3 Chandigarh 3.168312e+05
4 Chhattisgarh 4.303499e+06
5 Delhi 2.627513e+06
6 Goa 2.263083e+05
7 Gujarat 1.140201e+07
8 Haryana 3.557072e+06
9 Himachal Pradesh 1.059824e+06
10 Jammu & Kashmir 1.799932e+06
11 Jharkhand 4.469240e+06
12 Karnataka 1.066712e+07
13 Kerala 4.425900e+06
14 Madhya Pradesh 1.111548e+07
15 Maharashtra 1.999020e+07
16 Meghalaya 6.897368e+05
17 Odisha 6.545747e+06
18 Puducherry 2.122781e+05
19 Punjab 4.539362e+06
20 Rajasthan 1.004106e+07
21 Sikkim 1.068807e+05
22 Tamil Nadu 1.226955e+07
23 Telangana 7.939663e+06
24 Tripura 7.170026e+05
25 Uttar Pradesh 2.809483e+07
26 Uttarakhand 1.390228e+06
27 West Bengal 1.719854e+07
In [29]:
# 1. Sales Distribution by Product Category
plt.figure(figsize=(12, 6))
Labour_participation_by_area = files.groupby('Area')['Estimated Labour Participation Rate (%)'].mean().reset_index()
sns.barplot(x = Labour_participation_by_area['Estimated Labour Participation Rate (%)'], y = Labour_participation_by_area['Area'])
plt.title('Labour_participation_by_area')
plt.xlabel('Estimated Labour Participation Rate')
plt.tight_layout()
plt.show()
print("\
Labour_participation_by_area:")
print(Labour_participation_by_area)
Labour_participation_by_area:
Area Estimated Labour Participation Rate (%)
0 Rural 44.464819
1 Urban 40.901365
In [ ]:
In [ ]:
In [ ]: